#Working folder for temporary artifacts (e.g. chart png files before they are embedded in the log)
temp_folder='/home/kate/Research/Property/Notebooks/Experiments/tmp/'
#Experiment_name must NOT contain underscore (_)
Experiment_name='BasicPoisson'
#Experiment log file (Excel workbook; each configuration table lives in its own tab)
Experiments_file='/home/kate/Research/Property/Notebooks/Experiments/Logs/Set1-Poisson.xlsx'
#AllExperiments_tab is a table with a list of all experiments included in the log.
#Mandatory columns: Experiment (Experiment_name), Dataset (data file name), Target (target column name from Dataset).
#The rest of the columns are not used in the code below; they hold free-form notes: objective, status, result, notebook name used to conduct the experiment.
AllExperiments_tab='Experiments'
#Experiment configuration:
#1. Experiment_Features_tab: different datasets to try.
#Each line in the tab contains a model name and the set of features used to build a dataset for SageMaker.
#A feature can be an exact column name from the Dataset column in AllExperiments_tab, or a calculation based on exact column names evaluated with the pandas eval function.
Experiment_Features_tab='%s Features'%Experiment_name
#2. Alternatively, a set of data files with preprocessed data in S3 can be provided in the form:
#Model, Training_data, Validation_data[, Testing_data, Testing_labels]
Experiment_InputData_tab='%s InputData'%Experiment_name
#3. Experiment_Params_tab: each line in the tab contains a model name and the set of XGBoost parameters to apply to that model.
#The set of models must be consistent between Experiment_Features_tab and Experiment_Params_tab.
#Parameters can be the same for all models or specific to each model.
Experiment_Params_tab='%s Params'%Experiment_name
#Local folders: raw data, trained models, and generated training/testing csv files.
path_to_data='/home/kate/Research/Property/Data/'
path_to_models='/home/kate/Research/Property/Models/Experiments/%s/'%Experiment_name
path_to_training_data='/home/kate/Research/Property/Data/Experiments/%s/training/'%Experiment_name
path_to_testing_data='/home/kate/Research/Property/Data/Experiments/%s/testing/'%Experiment_name
#preprocessing parameter - the calendar year whose rows become the test set
split_year=2019
#number of folds for CV
num_folds=10
#level of detail returned from CV
#FeatureImportance Y/N
GetFIFlg='Y'
#Scores for Test data (should be provided in fit "test" input) Y/N
GetTestScoreFlg='Y'
#Prediction of Test data (should be provided in fit "test" input) Y/N
GetTestPredFlg='Y'
#evaluation metric used for CV and for scoring test predictions
score='poisson-nloglik' #'gini'
#Significance level for t-test
alpha=0.05
#n2/n1 (validation/training) ratio for the corrected t-test; if n2=n1 (i.e. n2/n1 = 1) it reduces to the usual Student t-test without correction
#10 folds means a 1/9 validation/training ratio
n2=1
n1=9
import sys
import time
import os
import re
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle as pkl
#for analyzing results: charts and t-test
import scipy.stats as stats
from matplotlib import pyplot as plt
import warnings
# NOTE(review): this silences ALL warnings (including pandas/xgboost deprecation
# notices) for the whole notebook — consider a narrower filter when debugging.
warnings.filterwarnings('ignore')
The experiment is configured in an experiment log file (an Excel workbook in my case, with one configuration table per tab).
# Load the master registry of experiments and pull this experiment's row:
# it defines the target column and the raw data file for every model below.
experiments = pd.read_excel(open(Experiments_file, 'rb'), sheet_name=AllExperiments_tab)
current_experiment = experiments.loc[experiments['Experiment'] == Experiment_name].iloc[0]
target = current_experiment['Target']
print('Target of models in %s experiment is %s'%(Experiment_name,target))
data_file = current_experiment['Dataset']
print('Datafile used in %s experiment is %s'%(Experiment_name,data_file))
Target of models in BasicPoisson experiment is cova_ic_nc_water Datafile used in BasicPoisson experiment is property_water_claims_non_cat_fs_v5.csv
This configuration will be used to preprocess the data; it also needs to be copied to S3 in csv format for easy reading in a preprocessing script if we use AWS SKLearnProcessor jobs/instances.
# Per-model feature configuration: one row per model, with an optional Offset
# column plus feature columns F1..Fn (column name or pandas eval expression).
model_features = pd.read_excel(open(Experiments_file, 'rb'), sheet_name=Experiment_Features_tab)
model_features
| Model | Offset | F1 | F2 | F3 | F4 | F5 | F6 | F7 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | ecy |
| 1 | Log | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | log_ecy |
| 2 | Offset | ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
| 3 | Offsetlog | log_ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
2a. Preprocessed data may already exist in S3. The experiment configuration can provide the list of files per model. In that case (len(preprocessed_data) > 0) the code skips all the data-preprocessing steps below.
# Optional InputData tab: files with already-preprocessed data per model.
# If the tab is missing, fall back to an empty frame so the preprocessing
# step below builds the datasets from scratch.
try:
    preprocessed_data = pd.read_excel(open(Experiments_file, 'rb'), sheet_name=Experiment_InputData_tab)
except Exception:
    # Bare `except:` replaced: it also swallowed SystemExit/KeyboardInterrupt.
    # Any read failure (missing sheet, unreadable file) means "no preprocessed data".
    preprocessed_data = pd.DataFrame()
# XGBoost parameter sets, one row per model; this tab is mandatory.
model_params = pd.read_excel(open(Experiments_file, 'rb'), sheet_name=Experiment_Params_tab)
model_params
| Model | objective | eval_metric | booster | colsample_bylevel | colsample_bytree | eta | subsample | max_depth | num_round | reg_lambda | reg_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | 1 | 0 |
| 1 | Log | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | 1 | 0 |
| 2 | Offset | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | 1 | 0 |
| 3 | Offsetlog | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | 1 | 0 |
4. Verification that we have the same set of models in both configurations.
# Sanity check: every model configured in the feature tab (and, when provided,
# in the preprocessed-data tab) must also appear in the parameter tab.
models_from_model_features = model_features['Model'].tolist()
models_from_model_params = model_params['Model'].tolist()
unmatched = [m for m in models_from_model_features if m not in models_from_model_params]
if unmatched:
    raise Exception('Different set of models in featuresets and parametersets!')
if len(preprocessed_data) > 0:
    models_from_preprocessed_data = preprocessed_data['Model'].tolist()
    unmatched = [m for m in models_from_preprocessed_data if m not in models_from_model_params]
    if unmatched:
        raise Exception('Different set of models in input data and parametersets!')
#sys.path.append('/home/kate/Research/YearBuilt/Notebooks/Experiments')
# Project helper module providing SaveToExperimentLog / SaveChartToExperimentLog.
import ExperimentsUtils as eu
Preprocessing output (training and testing datasets) is saved separately for each model, in a folder with the same name as the model configured in the experiment.
# Build per-model training/testing csv files only when no preprocessed data was
# configured in the InputData tab; otherwise the files listed there are reused.
if len(preprocessed_data)==0:
    preprocessed_data = pd.DataFrame(columns=['Model', 'Training_data', 'Testing_data', 'Training_offset','Testing_offset'])
    input_data_path=path_to_data+data_file
    print('Reading input data from {}'.format(input_data_path))
    # NOTE(review): error_bad_lines was removed in pandas 2.0 (use on_bad_lines);
    # this notebook appears to target an older pandas.
    dataset = pd.read_csv(input_data_path, error_bad_lines=False, index_col=False)
    # Time-based split: rows from split_year become the test set,
    # all earlier years become the training pool used by CV.
    dataset_test=dataset[(dataset.cal_year == split_year)]
    dataset=dataset[(dataset.cal_year < split_year)]
    #iterating thru the config tab with models and featuresets
    feature_columns=model_features.columns.tolist()
    feature_columns.remove('Offset')
    feature_columns.remove('Model')
    feature_columns
    for index, row in model_features.iterrows():
        model=row['Model']
        print (index, ': Creating datasets for model %s'%model)
        # A feature is either a column name or a pandas eval() expression
        # (e.g. 'cal_year-yearbuilt'); NaN cells mean "no feature in this slot".
        featureset=row[feature_columns].tolist()
        featureset=[x for x in featureset if str(x) != 'nan']
        print(','.join(featureset))
        #creating the dataset for a model according to its configured featureset
        X = pd.DataFrame()
        X_test = pd.DataFrame()
        for f in featureset:
            X[f]=dataset.eval(f)
            X_test[f]=dataset_test.eval(f)
        y=dataset.eval(target)
        y_test=dataset_test.eval(target)
        #Offset is not a mandatory column
        offset_flg=False
        test_offset_filename=''
        train_offset_filename=''
        try:
            offset_column=row['Offset']
            # NOTE(review): for a missing offset, row['Offset'] is a float NaN, so
            # the string comparison below is always True and dataset.eval(NaN)
            # raises; the except branch then resets offset_flg. It works, but by
            # accident — confirm before refactoring.
            if offset_column != 'nan':
                offset_train=dataset.eval(offset_column)
                offset_test=dataset_test.eval(offset_column)
                offset_flg=True
        except:
            offset_flg=False
        print('Testing data...')
        # csv layout: first column is the target, then the feature columns.
        test_dataset=pd.DataFrame({target:y_test}).join(X_test)
        test_data_output_path = path_to_testing_data+model
        if not os.path.exists(test_data_output_path):
            os.makedirs(test_data_output_path)
        test_data_filename = os.path.join(test_data_output_path, 'testing_%s.csv'%(model))
        test_dataset.to_csv(test_data_filename, header=True, index=False)
        if offset_flg:
            test_offset_filename = os.path.join(test_data_output_path, 'offset_%s.csv'%(model))
            offset_test.to_csv(test_offset_filename, header=True, index=False)
        #The rest of the data is used by CV as a whole and separated into training/validation inside CV
        print('Training data...')
        training_dataset=pd.DataFrame({target:y}).join(X)
        train_data_output_path=path_to_training_data+model
        if not os.path.exists(train_data_output_path):
            os.makedirs(train_data_output_path)
        train_data_filename = os.path.join(train_data_output_path, 'training_%s.csv'%model)
        training_dataset.to_csv(train_data_filename, header=True, index=False)
        if offset_flg:
            train_offset_filename = os.path.join(train_data_output_path, 'offset_%s.csv'%(model))
            offset_train.to_csv(train_offset_filename, header=True, index=False)
        # Record where this model's files were written (one row per model).
        preprocessed_data.loc[index]=[model, train_data_filename,test_data_filename,train_offset_filename,test_offset_filename]
    #Saving the names of the created training and testing datasets into the experiment log file
    preprocessed_data = pd.concat([preprocessed_data,model_features.drop('Model',axis=1)], axis=1)
    eu.SaveToExperimentLog(Experiments_file, '%s InputData'%Experiment_name, preprocessed_data)
Reading input data from /home/kate/Research/Property/Data/property_water_claims_non_cat_fs_v5.csv 0 : Creating datasets for model BaseModel cal_year-yearbuilt,cova_deductible,sqft,customer_cnt_active_policies,usagetype_encd,water_risk_3_blk,ecy Testing data... Training data... 1 : Creating datasets for model Log cal_year-yearbuilt,cova_deductible,sqft,customer_cnt_active_policies,usagetype_encd,water_risk_3_blk,log_ecy Testing data... Training data... 2 : Creating datasets for model Offset cal_year-yearbuilt,cova_deductible,sqft,customer_cnt_active_policies,usagetype_encd,water_risk_3_blk Testing data... Training data... 3 : Creating datasets for model Offsetlog cal_year-yearbuilt,cova_deductible,sqft,customer_cnt_active_policies,usagetype_encd,water_risk_3_blk Testing data... Training data...
# Display the per-model table of generated data-file locations.
preprocessed_data
| Model | Training_data | Testing_data | Training_offset | Testing_offset | Offset | F1 | F2 | F3 | F4 | F5 | F6 | F7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | ecy | ||
| 1 | Log | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | log_ecy | ||
| 2 | Offset | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
| 3 | Offsetlog | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | log_ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
# Re-validate the model names (preprocessed_data may have been loaded from the
# log rather than built above), then join files/features with the parameter sets.
models_from_preprocessed_data = preprocessed_data['Model'].tolist()
models_from_model_params = model_params['Model'].tolist()
missing_models = [m for m in models_from_preprocessed_data if m not in models_from_model_params]
if missing_models:
    raise Exception('Different set of models in preprocessed_data and parametersets!')
# merge (not concat) because, in general, the two frames can have a different
# number of rows - folds in data and different sets of params
data_for_training = pd.merge(model_params, preprocessed_data, on='Model', how='inner')
data_for_training
| Model | objective | eval_metric | booster | colsample_bylevel | colsample_bytree | eta | subsample | max_depth | num_round | ... | Training_offset | Testing_offset | Offset | F1 | F2 | F3 | F4 | F5 | F6 | F7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | ... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | ecy | ||
| 1 | Log | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | ... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | log_ecy | ||
| 2 | Offset | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | ... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
| 3 | Offsetlog | count:poisson | poisson-nloglik | gbtree | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 5000 | ... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | log_ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
4 rows × 24 columns
def cv_misc_callback(oof_train_scores:list, oof_valid_scores:list, best_models:list, maximize=True):
    """
    Build an xgboost.cv callback (legacy env-based callback API) that, on every
    boosting iteration:
      * appends each fold's train/validation score to the caller-supplied lists
        ``oof_train_scores`` / ``oof_valid_scores`` (one sub-list per iteration,
        one entry per fold), and
      * keeps the per-fold boosters of the best iteration seen so far in
        ``best_models`` (updated in place, one slot per fold).

    maximize: True when a larger aggregated score is better, False when smaller is.
    NOTE(review): relies on the module-level ``feval`` custom metric defined
    further down in this notebook — confirm cell execution order.
    """
    # Mutable closure state shared between init() and callback().
    state = {}
    def init(env):
        # Seed the running best score with the worst possible value.
        if maximize:
            state['best_score'] = -np.inf
        else:
            state['best_score'] = np.inf
    #--------------------------------------------------------------------------------
    def callback(env):
        #initialize state on the first call
        if not state:
            init(env)
        best_score = state['best_score']
        # The last entry of evaluation_result_list holds the aggregated validation score.
        score = env.evaluation_result_list[-1][1]
        #keep the fold models if the current score is better than the previous best
        if (maximize and score > best_score) or (not maximize and score < best_score):
            for i, cvpack in enumerate(env.cvfolds):
                best_models[i]=cvpack.bst
            state['best_score'] = score
        #collect this iteration's individual fold scores
        folds_train_scores = []
        folds_valid_scores = []
        for i, cvpack in enumerate(env.cvfolds):
            # eval() returns a string like '[0]\ttrain-metric:0.123\ttest-metric:0.456';
            # split it apart and keep only the numeric parts.
            scores = cvpack.eval(iteration=0,feval=feval)
            #print(scores)
            scores_l = re.split(': |\t',scores)
            train_score=scores_l[1].rpartition(':')[2]
            valid_score=scores_l[2].rpartition(':')[2]
            folds_train_scores.append(train_score)
            folds_valid_scores.append(valid_score)
        oof_train_scores.append(folds_train_scores)
        oof_valid_scores.append(folds_valid_scores)
    #--------------------------------------------------------------------------------
    # Run the callback after each iteration (not before), per the legacy callback API.
    callback.before_iteration = False
    return callback
def gini(y, pred):
    """
    Unnormalized Gini coefficient of predictions against actuals.

    y: array-like of actual values; pred: array-like of predictions.
    Rows are sorted by prediction descending (ties broken by original order via
    the row-index column), and the cumulative share of actuals is compared to
    the diagonal. Divide by gini(y, y) for the usual normalized [-1, 1] scale.
    """
    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24 —
    # use the builtin float (same dtype: float64).
    g = np.asarray(np.c_[y, pred, np.arange(len(y))], dtype=float)
    # lexsort uses the LAST key as primary: sort by -pred (descending pred),
    # with the original row index (column 2) as the stable tie-breaker.
    g = g[np.lexsort((g[:, 2], -1 * g[:, 1]))]
    gs = g[:, 0].cumsum().sum() / g[:, 0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)
def gini_xgb(pred, y):
    """Custom xgboost eval metric: normalized Gini of predictions vs. labels."""
    labels = y.get_label()
    return 'gini', gini(labels, pred) / gini(labels, labels)
from sklearn.metrics import roc_auc_score
def nLogLik_XGBoost (act,pred):
    """
    Mean Poisson negative log-likelihood, mean(pred - act*log(pred)),
    matching xgboost's 'poisson-nloglik' metric up to the constant log(act!) term.
    """
    frame = pd.DataFrame({"act": act, "pred": pred})
    per_row_nll = frame["pred"] - frame["act"] * np.log(frame["pred"])
    return np.mean(per_row_nll)
#settings that depend on the chosen score
#custom evaluation function: only gini needs a user-supplied feval,
#the built-in metrics are handled by xgboost itself
feval = gini_xgb if score == 'gini' else None
#direction of "best" for the CV callback: poisson-nloglik is minimized,
#every other supported score is maximized
Maximize = score != 'poisson-nloglik'
#Running this cell can be skipped; the results from the same Python code run in
#the background can be reused instead (see the monitoring block below).
#regular expression matching the feature columns (F1..F25) so they can be
#excluded from the xgboost parameter set
regex = re.compile('F[ 0-9]')
for index, row in data_for_training.iterrows():
    # Unique run id: model name + row index; also used as the output folder name.
    model='%s-%s'%(row['Model'],index)
    print(model)
    #Dataset: first column is the target, the rest are features (see preprocessing)
    train_dataset = pd.read_csv(row['Training_data'], error_bad_lines=False, index_col=False)
    X_train = train_dataset.iloc[:,1:]
    y_train = train_dataset.iloc[:, 0]
    dtrain = xgb.DMatrix(X_train, y_train)
    #offset is not mandatory: a missing/NaN path makes read_csv raise,
    #and the base margin is simply not set
    try:
        train_offset = pd.read_csv(row['Training_offset'], error_bad_lines=False, index_col=False)
        dtrain.set_base_margin(train_offset.values)
        print('training Offset was added')
    except:
        pass
    #Test dataset is only needed when test scores or predictions were requested
    if 'Y' in (GetTestScoreFlg,GetTestPredFlg):
        test_dataset = pd.read_csv(row['Testing_data'], error_bad_lines=False, index_col=False)
        X_test = test_dataset.iloc[:,1:]
        y_test = test_dataset.iloc[:, 0]
        dtest = xgb.DMatrix(X_test, y_test)
        #offset is not mandatory
        try:
            testing_offset = pd.read_csv(row['Testing_offset'], error_bad_lines=False, index_col=False)
            dtest.set_base_margin(testing_offset.values)
            print('testing Offset was added')
        except:
            pass
    #Hyperparameters: copy every configured column except bookkeeping/feature columns
    hyperparameters = {
        'seed': 42
    }
    for i, param in enumerate(data_for_training.columns):
        #skip the Model/file-name columns and the F1..Fn feature columns —
        #they are experiment configuration, not xgboost parameters (if kept,
        #they would pollute the experiment analytics without affecting training)
        if ((param in ('Model','Training_data','Validation_data','Testing_data','Testing_labels','Offset','Training_offset', 'Testing_offset')) | (bool(re.match(regex, param)))):
            continue
        #num_round is passed to xgb.cv separately as num_boost_round
        if param=='num_round':
            continue
        #for gini the default metric is disabled and the custom feval is used instead
        if ((param=='eval_metric') & (score=='gini')):
            hyperparameters['disable_default_eval_metric'] = '1'
            continue
        hyperparameters[param] = row[param]
    print(hyperparameters)
    num_boost_round = row['num_round']
    early_stopping_rounds = 100
    #OUT parameters filled in place by the custom callback:
    #per-iteration train and valid scores from all folds
    oof_train_scores = []
    oof_valid_scores = []
    #best booster per fold (updated in place by the callback)
    best_models=[None]*num_folds
    #===========================================================================================================
    args = {'params':hyperparameters,
            'dtrain':dtrain,
            'feval':feval,
            'num_boost_round':num_boost_round,
            'nfold':num_folds,
            'stratified':True,
            'shuffle':True,
            'early_stopping_rounds':early_stopping_rounds,
            'seed':42,
            'callbacks':[cv_misc_callback(oof_train_scores, oof_valid_scores,best_models,Maximize), xgb.callback.print_evaluation(period=10)]}
    cv_results=xgb.cv(**args)
    #===========================================================================================================
    #scores to dataframes (rows = iterations, columns = folds)
    df_oof_train_scores = pd.DataFrame.from_records(oof_train_scores).apply(pd.to_numeric)
    df_oof_valid_scores = pd.DataFrame.from_records(oof_valid_scores).apply(pd.to_numeric)
    #only the per-fold score column names (0..num_folds-1)
    columns = df_oof_train_scores.columns.tolist()
    #mean, std and sem across folds for each iteration
    df_oof_train_scores['std'] = df_oof_train_scores[columns].std(axis=1)
    df_oof_valid_scores['std'] = df_oof_valid_scores[columns].std(axis=1)
    df_oof_train_scores['sem'] = df_oof_train_scores[columns].sem(axis=1)
    df_oof_valid_scores['sem'] = df_oof_valid_scores[columns].sem(axis=1)
    df_oof_train_scores['mean'] = df_oof_train_scores[columns].mean(axis=1)
    df_oof_valid_scores['mean'] = df_oof_valid_scores[columns].mean(axis=1)
    #feature importance of the best models (one booster per fold)
    if GetFIFlg=='Y':
        oof_fi_weight_best = {}
        oof_fi_gain_best = {}
        oof_fi_cover_best = {}
        for i in range(0,num_folds):
            oof_fi_weight_best[i]=best_models[i].get_score(importance_type='weight')
            oof_fi_gain_best[i]= best_models[i].get_score(importance_type='gain')
            oof_fi_cover_best[i]= best_models[i].get_score(importance_type='cover')
        #converting to dataframes (rows = features, columns = folds)
        df_oof_fi_weight_best = pd.DataFrame(oof_fi_weight_best).apply(pd.to_numeric)
        df_oof_fi_gain_best = pd.DataFrame(oof_fi_gain_best).apply(pd.to_numeric)
        df_oof_fi_cover_best = pd.DataFrame(oof_fi_cover_best).apply(pd.to_numeric)
        #mean, std and sem across folds
        df_oof_fi_weight_best['std'] = df_oof_fi_weight_best[columns].std(axis=1)
        df_oof_fi_gain_best['std'] = df_oof_fi_gain_best[columns].std(axis=1)
        df_oof_fi_cover_best['std'] = df_oof_fi_cover_best[columns].std(axis=1)
        df_oof_fi_weight_best['sem'] = df_oof_fi_weight_best[columns].sem(axis=1)
        df_oof_fi_gain_best['sem'] = df_oof_fi_gain_best[columns].sem(axis=1)
        df_oof_fi_cover_best['sem'] = df_oof_fi_cover_best[columns].sem(axis=1)
        df_oof_fi_weight_best['mean'] = df_oof_fi_weight_best[columns].mean(axis=1)
        df_oof_fi_gain_best['mean'] = df_oof_fi_gain_best[columns].mean(axis=1)
        df_oof_fi_cover_best['mean'] = df_oof_fi_cover_best[columns].mean(axis=1)
        #feature names: from index to an explicit 'feature' column
        df_oof_fi_weight_best.reset_index(level=0, inplace=True)
        df_oof_fi_weight_best.columns=['feature'] + columns + ['std','sem','mean']
        df_oof_fi_gain_best.reset_index(level=0, inplace=True)
        df_oof_fi_gain_best.columns=['feature'] + columns + ['std','sem','mean']
        df_oof_fi_cover_best.reset_index(level=0, inplace=True)
        df_oof_fi_cover_best.columns=['feature'] + columns + ['std','sem','mean']
    if 'Y' in (GetTestScoreFlg,GetTestPredFlg):
        #Prediction on test data from each fold's best model
        df_prediction=pd.DataFrame()
        df_prediction['actual']=dtest.get_label()
        for i in range(0,num_folds):
            df_prediction[i]=best_models[i].predict(dtest)
        #Test scores computed from the test predictions
        df_scores = pd.DataFrame()
        for i in range(0,num_folds):
            if score=='gini':
                df_scores[i]=[gini(df_prediction['actual'], df_prediction[i])/gini(df_prediction['actual'], df_prediction['actual'])]
            elif score=='AUC':
                df_scores[i]=[roc_auc_score(df_prediction['actual'], df_prediction[i])]
            elif score=='poisson-nloglik':
                df_scores[i]=[nLogLik_XGBoost(df_prediction['actual'],df_prediction[i])]
        df_scores['std'] = df_scores[columns].std(axis=1)
        df_scores['sem'] = df_scores[columns].sem(axis=1)
        df_scores['mean'] = df_scores[columns].mean(axis=1)
    #persist all per-run artifacts under path_to_models/<model>-<index>/
    output_data_dir=path_to_models+model
    if not os.path.exists(output_data_dir):
        os.makedirs(output_data_dir)
    for i in range(0,num_folds):
        model_location = os.path.join(output_data_dir , 'model-fold-'+str(i))
        # NOTE(review): the file handle from open() is never closed explicitly —
        # consider a `with` block.
        pkl.dump(best_models[i], open(model_location, 'wb'))
    if GetTestPredFlg=='Y':
        predictions_location = os.path.join(output_data_dir, 'test_predictions.csv')
        print('Saving test predictions at {}'.format(predictions_location))
        df_prediction.to_csv(predictions_location, header=True, index=False)
    if GetTestScoreFlg=='Y':
        oof_test_scores_location = os.path.join(output_data_dir, 'oof_test_scores.csv')
        print('Saving oof_test_scores at {}'.format(oof_test_scores_location))
        df_scores.to_csv(oof_test_scores_location, header=True, index=False)
    cv_result_location = os.path.join(output_data_dir, 'cv_results.csv')
    print('Saving cv results at {}'.format(cv_result_location))
    cv_results.to_csv(cv_result_location, header=True, index=False)
    oof_train_scores_location = os.path.join(output_data_dir, 'oof_train_scores.csv')
    print('Saving oof_train_scores at {}'.format(oof_train_scores_location))
    df_oof_train_scores.to_csv(oof_train_scores_location, header=True, index=False)
    oof_valid_scores_location = os.path.join(output_data_dir, 'oof_valid_scores.csv')
    print('Saving oof_valid_scores at {}'.format(oof_valid_scores_location))
    df_oof_valid_scores.to_csv(oof_valid_scores_location, header=True, index=False)
    if GetFIFlg=='Y':
        oof_fi_weight_best_location = os.path.join(output_data_dir, 'oof_fi_weight_best.csv')
        print('Saving oof_fi_weight_best at {}'.format(oof_fi_weight_best_location))
        df_oof_fi_weight_best.to_csv(oof_fi_weight_best_location, header=True, index=False)
        oof_fi_gain_best_location = os.path.join(output_data_dir, 'oof_fi_gain_best.csv')
        print('Saving oof_fi_gain_best at {}'.format(oof_fi_gain_best_location))
        df_oof_fi_gain_best.to_csv(oof_fi_gain_best_location, header=True, index=False)
        oof_fi_cover_best_location = os.path.join(output_data_dir, 'oof_fi_cover_best.csv')
        print('Saving oof_fi_cover_best at {}'.format(oof_fi_cover_best_location))
        df_oof_fi_cover_best.to_csv(oof_fi_cover_best_location, header=True, index=False)
This is a separate block. If the training part above takes a long time and runs in background mode rather than in the notebook, its results can be loaded here from time to time to monitor progress from the notebook.
# Aggregate the per-model result files written by the training loop above into
# notebook-wide frames keyed by (Model, ind).
ModelTrainScores=pd.DataFrame()
ModelTestScores=pd.DataFrame()
ModelValidScores=pd.DataFrame()
BestModelFI_gain=pd.DataFrame()
BestModelFI_weight=pd.DataFrame()
BestModelFI_cover=pd.DataFrame()
ModelFiles = pd.DataFrame(columns=['Model', 'ind','Output Data'])
for index, row in data_for_training.iterrows():
    model=row['Model']
    ind=index
    # Output folder name matches the '<model>-<index>' run id used in training.
    ModelOutput=path_to_models+'%s-%s'%(model,ind)
    ModelFiles.loc[int(ind)]=[model, ind, ModelOutput]
    print('Processing %s, %s...'%(model,ind))
    if os.path.exists(ModelOutput):
        #Training fold scores
        oof_train_scores_file=os.path.join(ModelOutput, 'oof_train_scores.csv')
        oof_train_scores=pd.read_csv(oof_train_scores_file, error_bad_lines=False, index_col=False)
        oof_train_scores['Model']=model
        oof_train_scores['ind']=int(ind)
        ModelTrainScores = pd.concat([ModelTrainScores,oof_train_scores])
        #Validation fold scores
        oof_valid_scores_file=os.path.join(ModelOutput, 'oof_valid_scores.csv')
        oof_valid_scores=pd.read_csv(oof_valid_scores_file, error_bad_lines=False, index_col=False)
        oof_valid_scores['Model']=model
        oof_valid_scores['ind']=int(ind)
        ModelValidScores = pd.concat([ModelValidScores,oof_valid_scores])
        #Test scores (only written when requested during training)
        if (GetTestScoreFlg=='Y'):
            oof_test_scores_file=os.path.join(ModelOutput, 'oof_test_scores.csv')
            oof_test_scores=pd.read_csv(oof_test_scores_file, error_bad_lines=False, index_col=False)
            oof_test_scores['Model']=model
            oof_test_scores['ind']=int(ind)
            ModelTestScores = pd.concat([ModelTestScores,oof_test_scores])
        #Feature importance: all three files (gain/weight/cover) must be present
        if (GetFIFlg=='Y'):
            oof_fi_gain_best_file=os.path.join(ModelOutput, 'oof_fi_gain_best.csv')
            oof_fi_weight_best_file=os.path.join(ModelOutput, 'oof_fi_weight_best.csv')
            oof_fi_cover_best_file=os.path.join(ModelOutput, 'oof_fi_cover_best.csv')
            if ((GetFIFlg=='Y') & (os.path.isfile(oof_fi_gain_best_file)) & (os.path.isfile(oof_fi_weight_best_file)) & (os.path.isfile(oof_fi_cover_best_file))):
                #FI gain
                oof_fi_gain_best=pd.read_csv(oof_fi_gain_best_file, error_bad_lines=False, index_col=False)
                oof_fi_gain_best['Model']=model
                oof_fi_gain_best['ind']=int(ind)
                BestModelFI_gain = pd.concat([BestModelFI_gain,oof_fi_gain_best])
                #FI weight
                oof_fi_weight_best=pd.read_csv(oof_fi_weight_best_file, error_bad_lines=False, index_col=False)
                oof_fi_weight_best['Model']=model
                oof_fi_weight_best['ind']=int(ind)
                BestModelFI_weight = pd.concat([BestModelFI_weight,oof_fi_weight_best])
                #FI cover
                oof_fi_cover_best=pd.read_csv(oof_fi_cover_best_file, error_bad_lines=False, index_col=False)
                oof_fi_cover_best['Model']=model
                oof_fi_cover_best['ind']=int(ind)
                BestModelFI_cover = pd.concat([BestModelFI_cover,oof_fi_cover_best])
            else:
                print('Feature Importance files not found')
    else:
        print('Files do not exist')
Processing BaseModel, 0... Processing Log, 1... Processing Offset, 2... Processing Offsetlog, 3...
#The number of per-fold score columns depends on num_folds and is not known
#until the results are read back, so build every expected column-name list
#for the results tables up front.
fold_indices = range(int(num_folds))
folds_columns = [str(i) for i in fold_indices]
folds_train_columns = ['train-%s-fold' % i for i in fold_indices]
folds_valid_columns = ['valid-%s-fold' % i for i in fold_indices]
folds_test_columns = [str(i) for i in fold_indices]
folds_gain_columns = ['gain-%s' % i for i in fold_indices]
folds_weight_columns = ['weight-%s' % i for i in fold_indices]
folds_cover_columns = ['cover-%s' % i for i in fold_indices]
1.Testing fold scores
# Index the test scores by the model row number ('ind'); the 'ind' column itself
# is kept, the new index is named 'index'.
ModelTestScores = ModelTestScores.set_index(ModelTestScores['ind'].rename('index'))
2. Training and validation errors from folds:
#An extended version of the standard XGBoost CV output: per-fold scores plus
#mean/std/sem, for both training and validation, side by side per iteration.
train_part = ModelTrainScores[['Model', 'ind'] + folds_columns + ['mean', 'std', 'sem']]
valid_part = ModelValidScores[folds_columns + ['mean', 'std', 'sem']]
CVResults = pd.concat([train_part, valid_part], axis=1)
CVResults.columns = (['Model', 'ind']
                     + folds_train_columns
                     + ['train-%s-mean' % score, 'train-%s-std' % score, 'train-%s-sem' % score]
                     + folds_valid_columns
                     + ['valid-%s-mean' % score, 'valid-%s-std' % score, 'valid-%s-sem' % score])
CVResults.tail()
| Model | ind | train-0-fold | train-1-fold | train-2-fold | train-3-fold | train-4-fold | train-5-fold | train-6-fold | train-7-fold | ... | valid-3-fold | valid-4-fold | valid-5-fold | valid-6-fold | valid-7-fold | valid-8-fold | valid-9-fold | valid-poisson-nloglik-mean | valid-poisson-nloglik-std | valid-poisson-nloglik-sem | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2248 | Offsetlog | 3 | 0.035846 | 0.035866 | 0.035804 | 0.035841 | 0.035796 | 0.035823 | 0.035828 | 0.035814 | ... | 0.036289 | 0.036656 | 0.036519 | 0.036348 | 0.036378 | 0.036591 | 0.036484 | 0.036413 | 0.000181 | 0.000057 |
| 2249 | Offsetlog | 3 | 0.035845 | 0.035866 | 0.035804 | 0.035841 | 0.035795 | 0.035823 | 0.035827 | 0.035814 | ... | 0.036289 | 0.036656 | 0.036519 | 0.036348 | 0.036378 | 0.036591 | 0.036484 | 0.036413 | 0.000181 | 0.000057 |
| 2250 | Offsetlog | 3 | 0.035845 | 0.035866 | 0.035803 | 0.035840 | 0.035795 | 0.035823 | 0.035827 | 0.035814 | ... | 0.036289 | 0.036656 | 0.036519 | 0.036348 | 0.036378 | 0.036591 | 0.036484 | 0.036413 | 0.000181 | 0.000057 |
| 2251 | Offsetlog | 3 | 0.035845 | 0.035865 | 0.035803 | 0.035840 | 0.035795 | 0.035823 | 0.035827 | 0.035814 | ... | 0.036289 | 0.036656 | 0.036519 | 0.036348 | 0.036378 | 0.036591 | 0.036484 | 0.036413 | 0.000181 | 0.000057 |
| 2252 | Offsetlog | 3 | 0.035845 | 0.035865 | 0.035803 | 0.035840 | 0.035795 | 0.035823 | 0.035826 | 0.035813 | ... | 0.036289 | 0.036656 | 0.036519 | 0.036348 | 0.036378 | 0.036591 | 0.036484 | 0.036413 | 0.000181 | 0.000057 |
5 rows × 28 columns
# Combine the gain/weight/cover feature-importance tables (loaded above) into a
# single frame keyed by (Model, ind, feature); stays empty when no FI files exist.
BestModelFI = pd.DataFrame()
if (len(BestModelFI_gain)+len(BestModelFI_weight)+len(BestModelFI_cover)>0):
    # Rename the generic per-fold columns to metric-specific names so the three
    # tables can be merged side by side without collisions.
    BestModelFI_gain.columns=['feature']+folds_gain_columns+['gain-std', 'gain-sem', 'gain-mean', 'Model', 'ind']
    BestModelFI_gain=BestModelFI_gain[['Model', 'ind','feature']+folds_gain_columns+['gain-mean','gain-std', 'gain-sem']]
    BestModelFI_weight.columns=['feature']+folds_weight_columns+['weight-std', 'weight-sem', 'weight-mean', 'Model', 'ind']
    BestModelFI_weight=BestModelFI_weight[['Model', 'ind','feature']+folds_weight_columns+['weight-mean','weight-std', 'weight-sem']]
    BestModelFI_cover.columns=['feature']+folds_cover_columns+['cover-std', 'cover-sem', 'cover-mean', 'Model', 'ind']
    BestModelFI_cover=BestModelFI_cover[['Model', 'ind','feature']+folds_cover_columns+['cover-mean','cover-std', 'cover-sem']]
    BestModelFI=pd.merge(BestModelFI_gain,
                         BestModelFI_weight,
                         on=['Model','ind','feature'], how='inner')
    BestModelFI=pd.merge(BestModelFI,
                         BestModelFI_cover,
                         on=['Model','ind','feature'], how='inner')
BestModelFI.tail()
| Model | ind | feature | gain-0 | gain-1 | gain-2 | gain-3 | gain-4 | gain-5 | gain-6 | ... | cover-3 | cover-4 | cover-5 | cover-6 | cover-7 | cover-8 | cover-9 | cover-mean | cover-std | cover-sem | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21 | Offsetlog | 3 | cova_deductible | 2.449929 | 2.449118 | 2.432184 | 2.409626 | 2.365386 | 2.432055 | 2.525084 | ... | 13689.348146 | 13555.121403 | 13930.804544 | 14368.274198 | 12861.594128 | 12680.429478 | 12617.774269 | 13406.686589 | 608.061542 | 192.285943 |
| 22 | Offsetlog | 3 | water_risk_3_blk | 1.885903 | 1.868317 | 1.894430 | 1.847012 | 1.905641 | 1.865274 | 1.861026 | ... | 7298.562505 | 7438.115431 | 7656.384235 | 7367.730619 | 8098.948115 | 7295.369490 | 7640.875415 | 7505.010769 | 245.980943 | 77.786004 |
| 23 | Offsetlog | 3 | customer_cnt_active_policies | 1.916538 | 2.106300 | 1.896785 | 1.831346 | 1.843241 | 2.005979 | 1.953929 | ... | 9511.376125 | 9886.179673 | 9791.445852 | 10251.854552 | 10257.388304 | 10248.923585 | 9714.810470 | 10088.942759 | 448.172909 | 141.724718 |
| 24 | Offsetlog | 3 | usagetype_encd | 2.176185 | 2.237382 | 2.664227 | 2.806717 | 2.662646 | 2.590270 | 2.480208 | ... | 22062.451263 | 20892.846005 | 21005.953626 | 20376.089485 | 20858.113380 | 21485.665082 | 21995.379915 | 20736.942231 | 1156.918339 | 365.849702 |
| 25 | Offsetlog | 3 | sqft | 2.266995 | 2.214731 | 2.268641 | 2.226098 | 2.298115 | 2.346615 | 2.314614 | ... | 10961.375734 | 10809.463387 | 11116.410593 | 11202.780725 | 10847.096068 | 11384.820168 | 10794.308464 | 10916.883304 | 281.960541 | 89.163752 |
5 rows × 42 columns
#Feature-importance charts: one figure per model run with three panels
#(Gain / Weight / Cover), each sorted by that panel's mean importance and
#drawn with SEM error bars; pngs go to temp_folder for embedding in the log.
if len(BestModelFI):
    lst_chart_filenames = list()
    for index, row in data_for_training.iterrows():
        if len(BestModelFI[( (BestModelFI['Model']==row['Model']) & (BestModelFI['ind']==index))])>0:
            data=BestModelFI[( (BestModelFI['Model']==row['Model']) & (BestModelFI['ind']==index))].sort_values('gain-mean',ascending=False)
            fig, axs = plt.subplots(nrows=1, ncols=3,figsize=(20,5))
            fig.suptitle('%s %s'%(row['Model'],index))
            fig.subplots_adjust(bottom=0.5)
            ax = axs[0]
            ax.errorbar(data['feature'], data['gain-mean'], color = 'blue', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data['gain-sem'], fmt='o')
            ax.set_title('Gain')
            ax.set_xticklabels(data['feature'].values,rotation=90)
            ax.grid(axis='both')
            data=data.sort_values('weight-mean',ascending=False)
            ax = axs[1]
            ax.errorbar(data['feature'], data['weight-mean'], color = 'blue', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data['weight-sem'], fmt='o')
            ax.set_title('Weight')
            ax.set_xticklabels(data['feature'].values,rotation=90)
            ax.grid(axis='both')
            data=data.sort_values('cover-mean',ascending=False)
            ax = axs[2]
            # BUGFIX: the Cover panel previously reused the *weight* SEM
            # ('weight-sem') for its error bars — a copy-paste error.
            ax.errorbar(data['feature'], data['cover-mean'], color = 'blue', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data['cover-sem'], fmt='o')
            ax.set_title('Cover')
            ax.set_xticklabels(data['feature'].values,rotation=90)
            ax.grid(axis='both')
            chart_filename=temp_folder+'%s %s.png'%(row['Model'],index)
            lst_chart_filenames.append(chart_filename)
            fig.savefig(chart_filename,format='png')
# Persist the combined feature-importance table and the chart images into the
# experiment log workbook ('<Experiment_name> FI' tab).
if len(BestModelFI):
    #Saving the models' feature-importance results into the Experiment log file
    eu.SaveToExperimentLog(Experiments_file, '%s FI'%Experiment_name, BestModelFI)
    eu.SaveChartToExperimentLog(Experiments_file, '%s FI'%Experiment_name, len(BestModelFI), 20, lst_chart_filenames)
3. Visualization of scores aggregated from the folds' best models
# Collect, per model/dataset, the CV iteration with the best validation score.
BestResults = pd.DataFrame()
for index, row in data_for_training.iterrows():
    model_cv = CVResults[((CVResults['Model']==row['Model']) & (CVResults['ind']==index))]
    if len(model_cv) > 0:
        # .min() because the score here is poisson-nloglik, a loss where lower
        # is better; switch to .max() for scores such as AUC
        BestTestScore = model_cv['valid-%s-mean'%score].min()
        # even if several iterations tie on the best valid-...-mean, use only
        # the first one: head(1)
        BestModelResult = model_cv[model_cv['valid-%s-mean'%score]==BestTestScore].head(1).copy()
        BestModelResult['TotalIterations'] = model_cv.shape[0]
        BestResults = pd.concat([BestResults, BestModelResult])
# Guard: assigning the column list to an empty frame would raise a
# length-mismatch error when no CV results matched at all
if len(BestResults) > 0:
    BestResults.reset_index(inplace=True)
    BestResults.columns = ['BestIteration','Model', 'ind']+folds_train_columns+['train-%s-mean'%score, 'train-%s-std'%score, 'train-%s-sem'%score]+folds_valid_columns+['valid-%s-mean'%score, 'valid-%s-std'%score, 'valid-%s-sem'%score,'TotalIterations']
    BestResults = BestResults[['Model', 'ind','TotalIterations','BestIteration']+folds_train_columns+['train-%s-mean'%score, 'train-%s-std'%score, 'train-%s-sem'%score]+folds_valid_columns+['valid-%s-mean'%score, 'valid-%s-std'%score, 'valid-%s-sem'%score]]
BestResults
| Model | ind | TotalIterations | BestIteration | train-0-fold | train-1-fold | train-2-fold | train-3-fold | train-4-fold | train-5-fold | ... | valid-3-fold | valid-4-fold | valid-5-fold | valid-6-fold | valid-7-fold | valid-8-fold | valid-9-fold | valid-poisson-nloglik-mean | valid-poisson-nloglik-std | valid-poisson-nloglik-sem | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | 0 | 2393 | 2292 | 0.035686 | 0.035704 | 0.035645 | 0.035674 | 0.035635 | 0.035662 | ... | 0.036345 | 0.036659 | 0.036534 | 0.036381 | 0.036378 | 0.036602 | 0.036501 | 0.036429 | 0.000177 | 0.000056 |
| 1 | Log | 1 | 2393 | 2292 | 0.035686 | 0.035704 | 0.035645 | 0.035674 | 0.035635 | 0.035662 | ... | 0.036345 | 0.036659 | 0.036534 | 0.036381 | 0.036378 | 0.036602 | 0.036502 | 0.036430 | 0.000177 | 0.000056 |
| 2 | Offset | 2 | 2534 | 2433 | 0.036250 | 0.036261 | 0.036211 | 0.036242 | 0.036210 | 0.036214 | ... | 0.036681 | 0.036905 | 0.036938 | 0.036769 | 0.036763 | 0.036960 | 0.036908 | 0.036788 | 0.000162 | 0.000051 |
| 3 | Offsetlog | 3 | 2253 | 2152 | 0.035873 | 0.035892 | 0.035829 | 0.035867 | 0.035822 | 0.035848 | ... | 0.036292 | 0.036655 | 0.036516 | 0.036348 | 0.036375 | 0.036585 | 0.036483 | 0.036412 | 0.000180 | 0.000057 |
4 rows × 30 columns
#Individual model score means and standard errors of the mean.
#For AUC only: one could exclude models which did not learn anything
#(0.5 is random guessing), e.g.:
#data = BestResults[BestResults['valid-auc-mean']>0.5].copy()
#if len(ModelTestScores)>0:
#    data_test = ModelTestScores[ModelTestScores['mean']>0.5].copy()
data = BestResults.copy()
# BUGFIX: always define data_test so the `len(data_test)` check below cannot
# raise a NameError when there are no test scores
data_test = pd.DataFrame()
if len(ModelTestScores)>0:
    data_test = ModelTestScores.copy()
#list of models for xticks
data['xticks'] = data['Model']+' '+data['ind'].astype(str)
xticks = data['xticks'].unique().tolist()
# The x positions
r1 = np.arange(len(data))
# a third panel is added only when test scores are available
if len(ModelTestScores)>0:
    fig, axs = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(20,10))
else:
    fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(20,10))
ax = axs[0]
ax.errorbar(r1, data['valid-%s-mean'%score], color='cyan', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data['valid-%s-sem'%score], fmt='o')
ax.set_title('valid-%s-mean'%score)
ax.grid(axis='both')
ax = axs[1]
ax.errorbar(r1, data['train-%s-mean'%score], color='blue', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data['train-%s-sem'%score], fmt='o')
ax.set_title('train-%s-mean'%score)
ax.set_xticks([r for r in range(len(data))])
ax.set_xticklabels(xticks, rotation=90)
ax.grid(axis='both')
fig.suptitle('Means of %s with standard error of the mean'%score)
if len(data_test)>0:
    ax = axs[2]
    # assumes data_test rows are in the same model order as `xticks` — TODO confirm
    ax.errorbar(r1, data_test['mean'], color='green', ecolor='lightgray', elinewidth=3, capsize=0, yerr=data_test['sem'], fmt='o')
    ax.set_title('test-%s-mean'%score)
    ax.set_xticks([r for r in range(len(data_test))])
    ax.set_xticklabels(xticks, rotation=90)
    ax.grid(axis='both')
lst_model_scores_chart_filenames = list()
chart_filename = temp_folder+'Models Scores.png'
lst_model_scores_chart_filenames.append(chart_filename)
fig.savefig(chart_filename, format='png')
#Choose the baseline model name and index explicitly, or uncomment the lines
#below to pick the model with the best score; every other model is compared
#against this baseline.
BaseModel = 'BaseModel'
BaseInd = 0
#BaseModel=BestResults[BestResults['valid-%s-mean'%score]==BestResults['valid-%s-mean'%score].max()]['Model'].values[0]
#BaseInd=BestResults[BestResults['valid-%s-mean'%score]==BestResults['valid-%s-mean'%score].max()]['ind'].values[0]
BaseModelResults = BestResults[(BestResults['Model'] == BaseModel) & (BestResults['ind'] == BaseInd)][folds_valid_columns].values[0].tolist()
# Shapiro-Wilk normality check on the baseline fold scores; the t-test is
# only meaningful if the scores look normally distributed
not_normal_msg = 'The null hypothesis that the data are normally distributed is rejected'
normal_msg = 'The data are normally distributed'
shapiro_test = stats.shapiro(BaseModelResults)
base_comment = not_normal_msg if shapiro_test.pvalue < alpha else normal_msg
BestResults['corrected t-test Comment'] = base_comment
BestResults['t-test Comment'] = base_comment
#corrected paired t-test for every non-baseline record in BestResults
for index, model in BestResults.iterrows():
    if model['Model'] == BaseModel and model['ind'] == BaseInd:
        continue
    AnalyzedModelResults = model[folds_valid_columns].values.tolist()
    if stats.shapiro(AnalyzedModelResults).pvalue < alpha:
        BestResults.at[index, 'corrected t-test Comment'] = not_normal_msg
        continue
    (t, critical_value, pvalue) = eu.corrected_paired_ttest(BaseModelResults, AnalyzedModelResults, n1, n2, alpha)
    BestResults.at[index, 'corrected t-statistic'] = t
    BestResults.at[index, 'corrected pvalue'] = pvalue
    if pvalue >= alpha:
        BestResults.at[index, 'corrected t-test Comment'] = 'No difference with %s with %s significance level'%(BaseModel,alpha)
    else:
        BestResults.at[index, 'corrected t-test Comment'] = 'There is a difference with %s with %s significance level'%(BaseModel,alpha)
BestResults[['Model','ind','valid-%s-mean'%score,'corrected t-statistic','corrected pvalue','corrected t-test Comment']]
| Model | ind | valid-poisson-nloglik-mean | corrected t-statistic | corrected pvalue | corrected t-test Comment | |
|---|---|---|---|---|---|---|
| 0 | BaseModel | 0 | 0.036429 | NaN | NaN | The data are normally distributed |
| 1 | Log | 1 | 0.036430 | 0.688247 | 5.086465e-01 | No difference with BaseModel with 0.05 signifi... |
| 2 | Offset | 2 | 0.036788 | 14.957988 | 1.155917e-07 | There is a difference with BaseModel with 0.05... |
| 3 | Offsetlog | 3 | 0.036412 | 2.410309 | 3.922912e-02 | There is a difference with BaseModel with 0.05... |
# Join the experiment configuration onto the results; the configuration's
# duplicate 'Model' column is dropped before the column-wise concat.
config_cols = data_for_training.drop('Model', axis=1)
BestResults = pd.concat([BestResults, config_cols], axis=1)
BestResults
| Model | ind | TotalIterations | BestIteration | train-0-fold | train-1-fold | train-2-fold | train-3-fold | train-4-fold | train-5-fold | ... | Training_offset | Testing_offset | Offset | F1 | F2 | F3 | F4 | F5 | F6 | F7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BaseModel | 0 | 2393 | 2292 | 0.035686 | 0.035704 | 0.035645 | 0.035674 | 0.035635 | 0.035662 | ... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | ecy | ||
| 1 | Log | 1 | 2393 | 2292 | 0.035686 | 0.035704 | 0.035645 | 0.035674 | 0.035635 | 0.035662 | ... | NaN | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | log_ecy | ||
| 2 | Offset | 2 | 2534 | 2433 | 0.036250 | 0.036261 | 0.036211 | 0.036242 | 0.036210 | 0.036214 | ... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
| 3 | Offsetlog | 3 | 2253 | 2152 | 0.035873 | 0.035892 | 0.035829 | 0.035867 | 0.035822 | 0.035848 | ... | /home/kate/Research/Property/Data/Experiments/... | /home/kate/Research/Property/Data/Experiments/... | log_ecy | cal_year-yearbuilt | cova_deductible | sqft | customer_cnt_active_policies | usagetype_encd | water_risk_3_blk | NaN |
4 rows × 57 columns
# Corrected confidence intervals (via the eu helper) of the absolute
# validation-score differences between each model and the baseline,
# plotted as vertical segments with the mean difference marked in red.
CI_name = list()
CI_mean = list()
CI_lower = list()
CI_upper = list()
for index, model in BestResults.iterrows():
    if ((model['Model']!=BaseModel) | (model['ind']!=BaseInd)):
        AnalyzedModelResults = model[folds_valid_columns].values.tolist()
        diff = [np.abs(y - x) for y, x in zip(BaseModelResults, AnalyzedModelResults)]
        CI = eu.corrected_confidence_interval(BaseModelResults, AnalyzedModelResults, n1, n2, 1-alpha)
        CI_name.append(model['Model']+' '+str(model['ind']))
        CI_mean.append(np.mean(diff))
        CI_lower.append(CI[0])
        CI_upper.append(CI[1])
        BestResults.at[index,'BaseModel Diff mean'] = np.mean(diff)
        BestResults.at[index,'BaseModel Corrected CI lower'] = CI[0]
        BestResults.at[index,'BaseModel Corrected CI upper'] = CI[1]
CI_df = pd.DataFrame(list(zip(CI_name, CI_mean, CI_lower, CI_upper)), columns=['Model','mean','lower','upper'])
dim = np.arange(0, CI_df['upper'].max() + CI_df['upper'].max()/10, CI_df['upper'].max()/10)
plt.figure(figsize=(20,10))
for lower, mean, upper, x in zip(CI_df['lower'], CI_df['mean'], CI_df['upper'], range(len(CI_df))):
    # the explicit color kwarg overrides any fmt color, so the redundant 'r'
    # prefix is dropped from the fmt strings
    plt.plot((x,x), (lower,upper), '_-', markersize=20, color='blue')
    plt.plot(x, mean, 'o', color='red')
plt.xticks(range(len(CI_df)), list(CI_df['Model']), rotation=90)
plt.yticks(dim)
plt.grid(axis='both')
# BUGFIX: 'Correcetd' -> 'Corrected' in both the title and the chart file name
_ = plt.title('Corrected Confidence Interval of validation scores differences')
lst_chart_filenames = list()
chart_filename = temp_folder+'Corrected Confidence Interval of validation scores differences.png'
lst_chart_filenames.append(chart_filename)
plt.savefig(chart_filename, format='png')
#Standard (uncorrected) paired t-test for each non-baseline record in BestResults
for index, model in BestResults.iterrows():
    if ((model['Model']!=BaseModel) | (model['ind']!=BaseInd)):
        AnalyzedModelResults = model[folds_valid_columns].values.tolist()
        shapiro_test = stats.shapiro(AnalyzedModelResults)
        if shapiro_test.pvalue < alpha:
            # BUGFIX: this loop annotates the plain t-test, so it must write to
            # 't-test Comment' instead of clobbering 'corrected t-test Comment'
            BestResults.at[index,'t-test Comment'] = 'The null hypothesis that the data are normally distributed is rejected'
        else:
            t = stats.ttest_rel(BaseModelResults, AnalyzedModelResults)
            BestResults.at[index,'t-statistic'] = t.statistic
            BestResults.at[index,'pvalue'] = t.pvalue
            if t.pvalue >= alpha:
                BestResults.at[index,'t-test Comment'] = 'No difference with %s with %s alpha'%(BaseModel,alpha)
            else:
                BestResults.at[index,'t-test Comment'] = 'There is a difference with %s with %s alpha'%(BaseModel,alpha)
BestResults[['Model','ind','valid-%s-mean'%score,'t-statistic','pvalue','t-test Comment']]
| Model | ind | valid-poisson-nloglik-mean | t-statistic | pvalue | t-test Comment | |
|---|---|---|---|---|---|---|
| 0 | BaseModel | 0 | 0.036429 | NaN | NaN | The data are normally distributed |
| 1 | Log | 1 | 0.036430 | -1.000000 | 3.434364e-01 | No difference with BaseModel with 0.05 alpha |
| 2 | Offset | 2 | 0.036788 | -21.733452 | 4.356864e-09 | There is a difference with BaseModel with 0.05... |
| 3 | Offsetlog | 3 | 0.036412 | 2.954351 | 1.610666e-02 | There is a difference with BaseModel with 0.05... |
# Plain t-distribution confidence intervals (uncorrected) of the absolute
# validation-score differences vs the baseline.
CI_name = list()
CI_mean = list()
CI_lower = list()
CI_upper = list()
for index, model in BestResults.iterrows():
    if ((model['Model']!=BaseModel) | (model['ind']!=BaseInd)):
        AnalyzedModelResults = model[folds_valid_columns].values.tolist()
        diff = [np.abs(y - x) for y, x in zip(BaseModelResults, AnalyzedModelResults)]
        CI = stats.t.interval(1-alpha, len(diff)-1, loc=np.mean(diff), scale=stats.sem(diff))
        CI_name.append(model['Model']+' '+str(model['ind']))
        CI_mean.append(np.mean(diff))
        CI_lower.append(CI[0])
        CI_upper.append(CI[1])
        BestResults.at[index,'BaseModel Diff mean'] = np.mean(diff)
        # BUGFIX: store under uncorrected-CI column names; the previous code
        # overwrote the 'BaseModel Corrected CI lower/upper' values computed
        # earlier with eu.corrected_confidence_interval
        BestResults.at[index,'BaseModel CI lower'] = CI[0]
        BestResults.at[index,'BaseModel CI upper'] = CI[1]
CI_df = pd.DataFrame(list(zip(CI_name, CI_mean, CI_lower, CI_upper)), columns=['Model','mean','lower','upper'])
dim = np.arange(0, CI_df['upper'].max() + CI_df['upper'].max()/10, CI_df['upper'].max()/10)
plt.figure(figsize=(20,10))
for lower, mean, upper, x in zip(CI_df['lower'], CI_df['mean'], CI_df['upper'], range(len(CI_df))):
    plt.plot((x,x), (lower,upper), '_-', markersize=20, color='blue')
    plt.plot(x, mean, 'o', color='red')
plt.xticks(range(len(CI_df)), list(CI_df['Model']), rotation=90)
plt.yticks(dim)
plt.grid(axis='both')
# BUGFIX: 'onfidence' -> 'Confidence' in the title
_ = plt.title('Confidence Interval of validation scores differences')
chart_filename = temp_folder+'Confidence Interval of validation scores differences.png'
lst_chart_filenames.append(chart_filename)
plt.savefig(chart_filename, format='png')
#Persist the best-model results table together with the CI charts and the
#model-scores chart into the experiment log
results_tab = '%s BestResults' % Experiment_name
eu.SaveToExperimentLog(Experiments_file, results_tab, BestResults)
eu.SaveChartToExperimentLog(Experiments_file, results_tab, len(BestResults), 20, lst_chart_filenames)
eu.SaveChartToExperimentLog(Experiments_file, results_tab, len(BestResults)+100, 20, lst_model_scores_chart_filenames)
#Baseline for the test-set comparison: set it explicitly, or uncomment the
#lines below to take the best-scoring model. The remaining models are
#compared against this baseline.
if len(ModelTestScores)>0:
    #BaseModel=ModelTestScores[ModelTestScores['mean']==ModelTestScores['mean'].max()]['Model'].values[0]
    #BaseInd=ModelTestScores[ModelTestScores['mean']==ModelTestScores['mean'].max()]['ind'].values[0]
    BaseModel = 'BaseModel'
    BaseInd = 0
    base_mask = (ModelTestScores['Model'] == BaseModel) & (ModelTestScores['ind'] == BaseInd)
    BaseModelResults = ModelTestScores[base_mask][folds_test_columns].values[0].tolist()
    # Shapiro-Wilk normality check on the baseline's per-fold test scores
    if stats.shapiro(BaseModelResults).pvalue < alpha:
        ModelTestScores['Comment'] = 'The null hypothesis that the data are normally distributed is rejected'
    else:
        ModelTestScores['Comment'] = 'The data are normally distributed'
#Paired t-test of each model's per-fold test scores against the baseline
if len(ModelTestScores)>0:
    for index, model in ModelTestScores.iterrows():
        if (model['Model'] == BaseModel) and (model['ind'] == BaseInd):
            # the baseline is not compared with itself
            ModelTestScores.at[index,'t-statistic'] = np.nan
            ModelTestScores.at[index,'pvalue'] = np.nan
            continue
        AnalyzedModelResults = model[folds_test_columns].values.tolist()
        if stats.shapiro(AnalyzedModelResults).pvalue < alpha:
            ModelTestScores.at[index,'Comment'] = 'The null hypothesis that the data are normally distributed is rejected'
            continue
        t = stats.ttest_rel(BaseModelResults, AnalyzedModelResults)
        ModelTestScores.at[index,'t-statistic'] = t.statistic
        ModelTestScores.at[index,'pvalue'] = t.pvalue
        if t.pvalue >= alpha:
            ModelTestScores.at[index,'Comment'] = 'No difference with %s with %s significance level'%(BaseModel,alpha)
        else:
            ModelTestScores.at[index,'Comment'] = 'There is a difference with %s with %s significance level'%(BaseModel,alpha)
ModelTestScores[['Model','mean','t-statistic','pvalue','Comment']]
| Model | mean | t-statistic | pvalue | Comment | |
|---|---|---|---|---|---|
| index | |||||
| 0 | BaseModel | 0.037700 | NaN | NaN | The data are normally distributed |
| 1 | Log | 0.037700 | 0.991871 | 3.471765e-01 | No difference with BaseModel with 0.05 signifi... |
| 2 | Offset | 0.038113 | -184.900070 | 2.013583e-17 | There is a difference with BaseModel with 0.05... |
| 3 | Offsetlog | 0.037683 | 6.201821 | 1.585189e-04 | There is a difference with BaseModel with 0.05... |
if len(ModelTestScores)>0:
    # Confidence intervals of the absolute test-score differences vs the baseline.
    # NOTE(review): the column names below say 'Corrected' but the interval is
    # the plain stats.t.interval — confirm whether a corrected CI was intended.
    ci_rows = []
    for index, model in ModelTestScores.iterrows():
        if (model['Model'] == BaseModel) and (model['ind'] == BaseInd):
            continue
        AnalyzedModelResults = model[folds_test_columns].values.tolist()
        diff = [np.abs(y - x) for y, x in zip(BaseModelResults, AnalyzedModelResults)]
        CI = stats.t.interval(1-alpha, len(diff)-1, loc=np.mean(diff), scale=stats.sem(diff))
        ci_rows.append((model['Model']+' '+str(model['ind']), np.mean(diff), CI[0], CI[1]))
        ModelTestScores.at[index,'BaseModel Diff mean'] = np.mean(diff)
        ModelTestScores.at[index,'BaseModel Corrected CI lower'] = CI[0]
        ModelTestScores.at[index,'BaseModel Corrected CI upper'] = CI[1]
    CI_df = pd.DataFrame(ci_rows, columns=['Model','mean','lower','upper'])
if len(ModelTestScores)>0:
    # One vertical segment per model (blue) with the mean difference in red
    plt.figure(figsize=(20,10))
    dim = np.arange(0, CI_df['upper'].max() + CI_df['upper'].max()/10, CI_df['upper'].max()/10)
    for x, (lower, mean, upper) in enumerate(zip(CI_df['lower'], CI_df['mean'], CI_df['upper'])):
        plt.plot((x,x), (lower,upper), 'r_-', markersize=20, color='blue')
        plt.plot(x, mean, 'ro', color='red')
    plt.xticks(range(len(CI_df)), list(CI_df['Model']), rotation=90)
    plt.yticks(dim)
    plt.grid(axis='both')
    _ = plt.title('Confidence Interval of test scores differences')
    lst_chart_filenames = list()
    chart_filename = temp_folder+'Confidence Interval of test scores differences.png'
    lst_chart_filenames.append(chart_filename)
    plt.savefig(chart_filename, format='png')
The difference between the means of model scores for the entire population lies within this confidence interval. If there is no difference, the interval contains zero (0). If zero is NOT within the range of values, the difference is statistically significant.
if len(ModelTestScores)>0:
    #Persist the test-score comparison table and its CI chart to the experiment log
    test_tab = '%s TestScores '%Experiment_name
    eu.SaveToExperimentLog(Experiments_file, test_tab, ModelTestScores)
    eu.SaveChartToExperimentLog(Experiments_file, test_tab, len(ModelTestScores), 20, lst_chart_filenames)
#Learning curves: train vs validation mean score per CV iteration, with a
#shaded band of one standard error of the mean around each curve.
lst_chart_filenames = list()
for index, row in data_for_training.iterrows():
    cv = CVResults[((CVResults['Model']==row['Model']) & (CVResults['ind']==index))]
    if len(cv)>0:
        ax = cv[['train-%s-mean'%score,'valid-%s-mean'%score]].plot(title=row['Model']+'-'+str(index))
        for split, band_color in (('train', 'b'), ('valid', 'r')):
            mean_vals = cv['%s-%s-mean'%(split,score)].values
            sem_vals = cv['%s-%s-sem'%(split,score)].values
            ax.fill_between(cv.index.values, mean_vals - sem_vals, mean_vals + sem_vals, color=band_color, alpha=.1)
        chart_filename = temp_folder+'train-valid scores %s-%s.png'%(row['Model'],index)
        lst_chart_filenames.append(chart_filename)
        ax.figure.savefig(chart_filename, format='png')
#Persist the tail of the CV results and the learning-curve charts into the log
eu.SaveToExperimentLog(Experiments_file, '%s CVResults'%Experiment_name, CVResults.tail(10))
eu.SaveChartToExperimentLog(Experiments_file, '%s CVResults'%Experiment_name, 10, 20, lst_chart_filenames)
#Record the trained model artifact locations in the experiment log
eu.SaveToExperimentLog(Experiments_file, '%s ModelFiles' % Experiment_name, ModelFiles)
# Hyper-parameter vs score overview; 'diff' is the train/valid score gap
# scaled by 100 (negative when the training loss is lower than validation).
# .copy() avoids SettingWithCopyWarning on the column assignment below.
df = BestResults[['train-%s-mean'%score,'valid-%s-mean'%score,'colsample_bylevel','colsample_bytree','eta','subsample','max_depth','reg_alpha']].copy()
df['diff'] = 100*(df['train-%s-mean'%score] - df['valid-%s-mean'%score])
# BUGFIX: axis must be passed by keyword — positional DataFrame.drop(label, 1)
# was deprecated in pandas 1.1 and removed in pandas 2.0
df = df.drop('train-%s-mean'%score, axis=1)
df = df[['colsample_bylevel','colsample_bytree','eta','subsample','max_depth','reg_alpha','diff','valid-%s-mean'%score]]
df
| colsample_bylevel | colsample_bytree | eta | subsample | max_depth | reg_alpha | diff | valid-poisson-nloglik-mean | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 0 | -0.07672 | 0.036429 |
| 1 | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 0 | -0.07673 | 0.036430 |
| 2 | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 0 | -0.05632 | 0.036788 |
| 3 | 0.8 | 0.8 | 0.01 | 0.8 | 6 | 0 | -0.05630 | 0.036412 |
BestResults
import plotly.express as px
# Parallel-coordinates view of hyper-parameters against the validation score.
# NOTE(review): color_continuous_midpoint=0.34 is roughly 10x the observed
# scores (~0.036) — confirm the midpoint is intentional.
dims = ['colsample_bylevel','colsample_bytree','eta','subsample','max_depth','reg_alpha','diff','valid-%s-mean'%score]
fig = px.parallel_coordinates(df, dims, color='valid-%s-mean'%score,
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=0.34)
fig.show()